from google.colab import drive  # Colab-only helper for mounting Google Drive
# Mount Google Drive so the yearly tweet CSVs under /content/drive can be read.
drive.mount('/content/drive')
import warnings
warnings.filterwarnings("ignore")  # NOTE(review): blanket suppression also hides deprecation warnings — consider narrowing
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import glob
path = '/content/drive/My Drive/archive/'  # folder holding the 2017-2022 tweet CSVs
# `path` already ends in '/', so append the pattern directly; the original
# `path + "/*.csv"` produced a harmless but sloppy double slash in the glob.
files = glob.glob(path + "*.csv")
# Load every yearly CSV (2017..2022) and stack them into one DataFrame.
# DataFrame.append was deprecated and removed in pandas 2.0; a single
# pd.concat over all frames is both the supported API and O(n) instead of
# re-copying the accumulated frame on every iteration.
frames = [pd.read_csv(file) for file in files]
df_tweet = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df_tweet  # notebook cell display of the combined frame
import datetime  # NOTE(review): appears unused — pd.DatetimeIndex does the parsing; kept to avoid breaking unseen code
# Derive a Year column from the tweet date so rows can be grouped per year.
df_tweet['Year'] = pd.DatetimeIndex(df_tweet['date']).year
df_tweet.head()
# Lower-case all tweet text with the vectorised string accessor instead of a
# Python-level list comprehension (also tolerant of missing values).
df_tweet['tweet'] = df_tweet['tweet'].str.lower()
# One DataFrame per calendar year, sliced out of the combined tweet frame.
df_tweet_2017 = df_tweet.loc[df_tweet['Year'] == 2017]
df_tweet_2018 = df_tweet.loc[df_tweet['Year'] == 2018]
df_tweet_2019 = df_tweet.loc[df_tweet['Year'] == 2019]
df_tweet_2020 = df_tweet.loc[df_tweet['Year'] == 2020]
df_tweet_2021 = df_tweet.loc[df_tweet['Year'] == 2021]
df_tweet_2022 = df_tweet.loc[df_tweet['Year'] == 2022]
def tw(x):
    """Concatenate every tweet in frame *x* into one space-separated string.

    Each tweet is followed by a single space, so the result carries a
    trailing space (and is '' when *x* has no tweets) — exactly as the
    original loop produced.  Built with str.join instead of repeated
    `+=`, which is quadratic on large corpora.
    """
    return ''.join(tweet + ' ' for tweet in x['tweet'])
# Build the combined corpus string for each year in one pass.
tweet_2017, tweet_2018, tweet_2019, tweet_2020, tweet_2021, tweet_2022 = (
    tw(frame)
    for frame in (df_tweet_2017, df_tweet_2018, df_tweet_2019,
                  df_tweet_2020, df_tweet_2021, df_tweet_2022)
)
import nltk
nltk.download('punkt')      # tokenizer models required by word_tokenize
nltk.download('stopwords')  # English stop-word list used by word_filter below
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def word_filter(x):
    """Tokenize string *x* and drop stop words / punctuation tokens.

    Returns the surviving tokens as a list, in original order.
    """
    words = word_tokenize(x)
    stop_words = set(stopwords.words('english'))
    # Extra punctuation, emoji and corpus-specific tokens to exclude.
    # NOTE(review): tweets are lower-cased earlier in the pipeline, so the
    # capitalised entries ('I', 'We', 'The', ...) only matter if this is
    # ever called on raw text — confirm whether they are still needed.
    stop_words.update(['@', '.', ':', ';', '&', 'I', ',', 'A', '!', '’', ')', '(', '🤣', '?', 'We', 'In', 'It','The', '“', '”', 'This', 'If','https','\'s','n\'t','...','3','🤣🤣','…'])
    # List comprehension replaces the manual append loop (same output).
    return [word for word in words if word not in stop_words]
# 1. Word Frequencies for Each Year
# One stop-word-filtered frequency distribution per year.
tweet_2017_freq, tweet_2018_freq, tweet_2019_freq, tweet_2020_freq, tweet_2021_freq, tweet_2022_freq = (
    nltk.FreqDist(word_filter(corpus))
    for corpus in (tweet_2017, tweet_2018, tweet_2019,
                   tweet_2020, tweet_2021, tweet_2022)
)
# Print the ten most frequent words for each year, 2017 through 2022.
for dist in (tweet_2017_freq, tweet_2018_freq, tweet_2019_freq,
             tweet_2020_freq, tweet_2021_freq, tweet_2022_freq):
    dist.tabulate(10)
# 2. Top 10 words (for each year) by the highest value of Word Frequency
import matplotlib.pyplot as plt
import seaborn as sns
def frequency_plot(x):
    """Histogram of the word-frequency values in FreqDist *x*.

    Each bar counts how many distinct words occur with a frequency
    falling in that bin, so the x axis holds frequency values.
    """
    t = pd.DataFrame(x.most_common(), columns=['Key', 'value'])
    sns.histplot(t['value'], bins=100)
    # Corrected axis labels: the histogram's x axis is the frequency value,
    # not the words (the original labels described the wrong axes).
    plt.xlabel("Word Frequency")
    plt.ylabel("Number of Words")
# 3x2 grid of frequency histograms, one panel per year (2017..2022).
plt.figure(figsize=(15,15))
for position, (label, dist) in enumerate(
        [("2017", tweet_2017_freq), ("2018", tweet_2018_freq),
         ("2019", tweet_2019_freq), ("2020", tweet_2020_freq),
         ("2021", tweet_2021_freq), ("2022", tweet_2022_freq)], start=1):
    plt.subplot(3, 2, position)
    frequency_plot(dist)
    plt.title(label)
# 3. Histogram of Word Frequencies for Each Year
import scipy.stats as ss
import math
def log_plot(x):
    """Draw a log-log (Zipf-style) plot of word frequency against rank."""
    ranked = pd.DataFrame(x.most_common(), columns=['Word', 'Count'])
    # most_common() is already sorted descending, so row order is the rank.
    ranked['rank'] = range(1, ranked.shape[0] + 1)
    plt.loglog(ranked['rank'], ranked['Count'])
    plt.xlabel('Rank')
    plt.ylabel('Word Frequency')
# 3x2 grid of log-log rank/frequency plots, one panel per year (2017..2022).
plt.figure(figsize=(15,15))
for position, (label, dist) in enumerate(
        [("2017", tweet_2017_freq), ("2018", tweet_2018_freq),
         ("2019", tweet_2019_freq), ("2020", tweet_2020_freq),
         ("2021", tweet_2021_freq), ("2022", tweet_2022_freq)], start=1):
    plt.subplot(3, 2, position)
    log_plot(dist)
    plt.title(label)
%pip install networkx==2.6.3  # install networkx 2.6.3 (IPython magic; a bare "pip" line is not valid Python)
import networkx as nx
def bigram_graph(x):
    """Draw a co-occurrence graph of the 200 most frequent n-grams in *x*.

    *x* is a DataFrame with a 'tweet' column.  NOTE(review): despite the
    name, ngram_range=(2, 3) counts trigrams as well as bigrams, and only
    the first two words of each n-gram become graph nodes; kept as-is to
    preserve behaviour — confirm whether (2, 2) was intended.
    """
    from nltk.corpus import stopwords
    # Stop words plus punctuation/emoji tokens specific to this corpus.
    stoplist = stopwords.words('english') + ['@', '.', ':', ';', '&', 'I', ',', 'A', '!', '’', ')', '(', '🤣', '?', 'We', 'In', 'It','The', '“', '”', 'This', 'If','https','\'s','n\'t','...','3','🤣🤣','…']
    c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2, 3))
    ngrams = c_vec.fit_transform(x['tweet'])
    # Total occurrences of each n-gram across all tweets.
    # NOTE(review): toarray() densifies the matrix — may be memory-heavy on
    # a large vocabulary.
    count_values = ngrams.toarray().sum(axis=0)
    vocab = c_vec.vocabulary_
    # Sorted descending by frequency (ties broken by the n-gram string).
    df_bigram = pd.DataFrame(
        sorted(((count_values[i], k) for k, i in vocab.items()), reverse=True)
    ).rename(columns={0: 'frequency', 1: 'bigram'})
    # Vectorised split of each n-gram into its first two words; replaces the
    # original manual index loop with the same result.
    parts = df_bigram['bigram'].str.split()
    df_bigram['bi_1'] = parts.str[0]
    df_bigram['bi_2'] = parts.str[1]
    df_bigram.drop('bigram', axis=1, inplace=True)
    df_bigram = df_bigram.iloc[:200]  # keep only the 200 most frequent
    g = nx.from_pandas_edgelist(df_bigram, source='bi_1', target='bi_2', edge_attr='frequency')
    nx.draw_random(g, with_labels=True)
    plt.gcf().set_size_inches(15, 15)
    plt.show()
# Draw the n-gram co-occurrence graph for each year, 2017 through 2022.
for yearly_frame in (df_tweet_2017, df_tweet_2018, df_tweet_2019,
                     df_tweet_2020, df_tweet_2021, df_tweet_2022):
    bigram_graph(yearly_frame)
!jupyter nbconvert --to html Group_53_Project_3_task2.ipynb